source("config.R")
## Loading required package: pacman

installs and loads necessary packages

Let’s start with two data sets that have home price information by Zip Code from Zillow as well as a little U.S. Census Bureau data. More specifically, there’s median single-family home value in one and median price per square foot in the other, by Zip Code in 4 eastern Massachusetts counties: Suffolk (mostly Boston), Middlesex (just to the west), Essex (just to the north), and Norfolk (Brookline & others)

I’ll read them in with the readr package, which is good at keeping zip codes as character strings and not numbers (as numbers, here in New England the initial 0 gets stripped out)

price_median <- readr::read_csv("data/zillow_data_median_sf_price.csv")
## Parsed with column specification:
## cols(
##   ZipCode = col_character(),
##   City = col_character(),
##   County = col_character(),
##   TotalPopulation = col_double(),
##   MedianSFPrice = col_double(),
##   PctNonLatinxWhite = col_double()
## )
price_sqft <- readr::read_csv("data/zillow_data_price_per_sq_ft.csv")
## Parsed with column specification:
## cols(
##   ZipCode = col_character(),
##   City = col_character(),
##   County = col_character(),
##   TotalPopulation = col_double(),
##   PricePerSqFt = col_double(),
##   PctNonLatinxWhite = col_double()
## )

Before doing any analysis, it’s always good to take a few basic steps to get familiar with the data set. Let’s look at the price_median data set.

head(price_median)
## # A tibble: 6 x 6
##   ZipCode City     County    TotalPopulation MedianSFPrice PctNonLatinxWhi~
##   <chr>   <chr>    <chr>               <dbl>         <dbl>            <dbl>
## 1 01431   Ashby    Middlesex            3195        271400             94.9
## 2 01432   Ayer     Middlesex            7703        354700             80  
## 3 01450   Groton   Middlesex           11282        497500             92.5
## 4 01460   Littlet~ Middlesex            9754        479900             89  
## 5 01463   Peppere~ Middlesex           12049        359400             91.8
## 6 01464   Shirley  Middlesex            7572        347100             68.4
tail(price_median) # make sure there's no total row at the end to screw things up
## # A tibble: 6 x 6
##   ZipCode City      County   TotalPopulation MedianSFPrice PctNonLatinxWhi~
##   <chr>   <chr>     <chr>              <dbl>         <dbl>            <dbl>
## 1 02481   Wellesley Norfolk            16309       1374500             80.3
## 2 02482   Wellesley Norfolk            11015       1190300             78.6
## 3 02492   Needham   Norfolk            20128        968400             87.6
## 4 02493   Weston    Middles~           12027       1536800             79.7
## 5 02494   Needham   Norfolk            10301        871300             78.1
## 6 02762   Plainvil~ Norfolk             9000        387000             95.3

Check out the structure

str(price_median)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 173 obs. of  6 variables:
##  $ ZipCode          : chr  "01431" "01432" "01450" "01460" ...
##  $ City             : chr  "Ashby" "Ayer" "Groton" "Littleton" ...
##  $ County           : chr  "Middlesex" "Middlesex" "Middlesex" "Middlesex" ...
##  $ TotalPopulation  : num  3195 7703 11282 9754 12049 ...
##  $ MedianSFPrice    : num  271400 354700 497500 479900 359400 ...
##  $ PctNonLatinxWhite: num  94.9 80 92.5 89 91.8 68.4 93.8 93.2 79.4 69.7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ZipCode = col_character(),
##   ..   City = col_character(),
##   ..   County = col_character(),
##   ..   TotalPopulation = col_double(),
##   ..   MedianSFPrice = col_double(),
##   ..   PctNonLatinxWhite = col_double()
##   .. )

OR

dplyr::glimpse(price_median)
## Observations: 173
## Variables: 6
## $ ZipCode           <chr> "01431", "01432", "01450", "01460", "01463",...
## $ City              <chr> "Ashby", "Ayer", "Groton", "Littleton", "Pep...
## $ County            <chr> "Middlesex", "Middlesex", "Middlesex", "Midd...
## $ TotalPopulation   <dbl> 3195, 7703, 11282, 9754, 12049, 7572, 7454, ...
## $ MedianSFPrice     <dbl> 271400, 354700, 497500, 479900, 359400, 3471...
## $ PctNonLatinxWhite <dbl> 94.9, 80.0, 92.5, 89.0, 91.8, 68.4, 93.8, 93...
summary(price_median)
##    ZipCode              City              County          TotalPopulation
##  Length:173         Length:173         Length:173         Min.   : 1964  
##  Class :character   Class :character   Class :character   1st Qu.:10676  
##  Mode  :character   Mode  :character   Mode  :character   Median :18697  
##                                                           Mean   :21118  
##                                                           3rd Qu.:28505  
##                                                           Max.   :61375  
##  MedianSFPrice     PctNonLatinxWhite
##  Min.   : 271400   Min.   : 2.60    
##  1st Qu.: 445800   1st Qu.:68.40    
##  Median : 567400   Median :79.70    
##  Mean   : 664831   Mean   :74.45    
##  3rd Qu.: 739900   3rd Qu.:89.40    
##  Max.   :2200600   Max.   :97.70

Some more substantial summaries

skimr::skim(price_median)
Hmisc::describe(price_median)
## price_median 
## 
##  6  Variables      173  Observations
## ---------------------------------------------------------------------------
## ZipCode 
##        n  missing distinct 
##      173        0      173 
## 
## lowest : 01431 01432 01450 01460 01463, highest: 02482 02492 02493 02494 02762
## ---------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      173        0      116 
## 
## lowest : Acton      Amesbury   Andover    Arlington  Ashby     
## highest: Wilmington Winchester Winthrop   Woburn     Wrentham  
## ---------------------------------------------------------------------------
## County 
##        n  missing distinct 
##      173        0        4 
##                                                   
## Value          Essex Middlesex   Norfolk   Suffolk
## Frequency         38        74        38        23
## Proportion     0.220     0.428     0.220     0.133
## ---------------------------------------------------------------------------
## TotalPopulation 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      173        0      173        1    21118    14622     4367     5984 
##      .25      .50      .75      .90      .95 
##    10676    18697    28505    38944    47280 
## 
## lowest :  1964  2270  2812  3155  3195, highest: 53873 55074 56541 59779 61375
## ---------------------------------------------------------------------------
## MedianSFPrice 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      173        0      171        1   664831   347931   335340   358840 
##      .25      .50      .75      .90      .95 
##   445800   567400   739900  1144500  1365740 
## 
## lowest :  271400  279100  283500  293600  298200
## highest: 1788600 1919400 1987800 2122600 2200600
## ---------------------------------------------------------------------------
## PctNonLatinxWhite 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      173        0      154        1    74.45     21.2    29.62    46.18 
##      .25      .50      .75      .90      .95 
##    68.40    79.70    89.40    92.96    95.34 
## 
## lowest :  2.6  4.2 10.3 12.0 16.4, highest: 96.6 96.9 97.2 97.4 97.7
## ---------------------------------------------------------------------------

Visualizations make it much easier to see things like distributions and outliers.

Don’t know what type of visualization to use for exploring the data you have? https://www.data-to-viz.com gives you suggestions based on the type of data, plus code and advice. Super useful.

Let’s first look at just current median price. One numerical. We can do a histogram.

I’m going to show you both base R built-in graphics and the most popular R package for visualization, widely used among journalists and others, called ggplot2. It’s mostly a matter of personal preference which you want to use. I use both, sometimes base R for ‘quick and dirty’ and ggplot when I want to do more customization.

Quick base R histogram – hist() function. It will try to guess good breaks for your bars.

hist(price_median$MedianSFPrice)

With headline and better x axis label. main for headline, xlab for x axis

hist(price_median$MedianSFPrice , main = "Zillow median home prices, March 2019", xlab = "Price")

ggplot2 is a little more involved.

The first line only sets up what the data frame is, what the x and y values should be, and if you want to group by a category or color-fill by a category. The first line doesn’t actually display anything.

The second line says what kind of viz you want. Syntax is

ggplot(mydata, aes(x = xcolumn, y = ycolumn)) +
     geom_viztype()

In this case we only have one value so don’t need to specify x and y

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot2’s histogram defaults to 30 bins but scolds you to pick something with better intention. Although that looks decent to me, we can try out some others.

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 50000)

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 100000)

There is a LOT more customizing you can do. Here I’ll make the bars blue outlined in white.

# Change the bar fill color and outline color
ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")

I’ll also add a headline and change the default X axis label with ggtitle and xlab.

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 50000, fill = "darkblue", color = "white") +
  ggtitle("Zillow median home prices, March 2019") +
  xlab("Price") 

Add a headline and x-axis label, change the theme

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")+
  ggtitle("Zillow median home prices, March 2019") +
  xlab("Price") +
  theme_light()

Add dollar signs and commas to the axis

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")+
  ggtitle("Zillow median home prices, March 2019") +
  xlab("Price") +
  theme_light() +
  scale_x_continuous(labels = dollar)

You can play with lots of themes and the ggthemes package :-)

ggplot(price_median, aes(MedianSFPrice)) +
  geom_histogram(binwidth = 50000, fill = "darkblue", color = "white")+
  labs(title = "Median Home Prices, March 2019",
       subtitle = "Source: Zillow Mass. data from Suffolk, Middlesex, Essex, and Norfolk counties "
       ) +
  xlab("Price") +
  scale_x_continuous(labels = dollar) +
  ggthemes::theme_economist() 

But back to our data. Let’s take another look at distribution, this time with a box plot

Base R

boxplot(price_median$MedianSFPrice)

Lots of expensive outliers

ggplot2 - here I use color suggestions from Data to Viz’s R graph gallery.

Since I don’t have an x column but boxplot looks for one, I’m using an empty character string for that and the label

ggplot(price_median, aes(x = "", y = MedianSFPrice)) +
  geom_boxplot(fill="slateblue", alpha=0.2) +
  xlab("")

Again I add dollar sign and commas to make y axis easier to read

ggplot(price_median, aes(x = "", y = MedianSFPrice)) +
  geom_boxplot(fill="slateblue", alpha=0.2) +
  xlab("") +
  scale_y_continuous(labels = dollar)

A bit easier to see those expensive outliers.

Where ggplot also comes in handy is if we’d like to create multiple boxplots by a category, such as one box plot per county. All we need to do as put county on the X axis!

ggplot(price_median, aes(x = County, y = MedianSFPrice)) +
  geom_boxplot(fill="slateblue", alpha=0.2) 

That’s a bit of a different look.

What about Boston, Cambridge, and Somerville? Let’s filter our data set for just those 2 cities:

bos_cam_som <- filter(price_median, City %in% c("Boston", "Cambridge", "Somerville"))

ggplot(bos_cam_som, aes(x = City, y = MedianSFPrice)) +
  geom_boxplot(fill="slateblue", alpha=0.2) 

Well that’s interesting.

Now let’s take a look at median price per square foot in Boston, Cambridge, and Somerville.

bos_cam_som_per_sq_ft <- filter(price_sqft, City %in% c("Boston", "Cambridge", "Somerville"))

ggplot(bos_cam_som_per_sq_ft, aes(x = City, y = PricePerSqFt)) +
  geom_boxplot(fill="slateblue", alpha=0.2) 

That is incredibly different. Remember that this is looking at ONE MEDIAN PER ZIP CODE, not individual prices, so it’s looking at DIFFERENCES IN NEIGHBORHOOD MEDIANS. Looks like there may be a much greater difference in median home sizes by Zip Code in Boston. Also look at those two extremely pricey Zip Code outliers in Boston.

Median price vs percent white?

scatterplot

base R

plot(price_median$PctNonLatinxWhite, price_median$MedianSFPrice)

Could there be relationship? It’s possible to make this base R look better and add a regression line, but I tend to do that work in ggplot.

Remember, first line of code sets up the data set, x values and y values. Second line says what type of “geom” viz type you want.

ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice)) +
  geom_point() 

If you’d like to see which dots are what counties, you can color by county:

ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County)) +
  geom_point() 

Things are a little hard to see with the grey background, I’ll change the theme to get rid of the grid lines altogether and bump up the dot size a bit.

ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County)) +
  geom_point(size = 2) +
  theme_few() # named for Stephen Few, well known in the data viz field

It would be cool to be able to mouse over the dots and see underlying data, including the city. The plotly package turns a ggplot static plot into a JavaScript one! Just wrap your code in the ggplotly function.

plotly::ggplotly(
  ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County)) +
    geom_point(size = 2) +
    theme_few() # named for Stephen Few, well known in the data viz field
)

I can’t tell you why, but the syntax for adding fields to the tooltip is something like

text = paste("mycolumn:", mycolumn)

ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County, text = paste("City:", City))) +
  geom_point(size = 2) +
  theme_few() # named for Stephen Few, well known in the data viz field

plotly::ggplotly(
  ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County, text = paste("City:", City))) +
    geom_point(size = 2) +
    theme_few() # named for Stephen Few, well known in the data viz field
)

Double click on the legend to just look at one county

If you want to learn more about plotly and R, check out:
https://plot.ly/r/ https://plotly-r.com/overview.html

Note that you can store a graph in a variable, just like any other value

mygraph <- ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice, color = County, text = paste("City:", City))) +
  geom_point(size = 2) +
  theme_few() # named for Stephen Few, well known in the data viz field
mygraph

plotly::ggplotly(
  mygraph
)

Note also you can just use ggplotly() if plotly is loaded in your working session.

I added the plotly:: so you know which package that function comes from.

To add a regression line, lm for linear model. We want to look at all points together, not by county, so back to

mygraph <- ggplot(price_median, aes(PctNonLatinxWhite, MedianSFPrice)) +
  geom_point() +
  theme_minimal()

mygraph

Add linear regression line:

mygraph +
  geom_smooth(method = lm)

This includes a 95% confidence interval. Not much of a relationship, even when considering the confidence interval. To take out the shaded area, se = FALSE.

mygraph +
  geom_smooth(method = lm, se = FALSE)

Let R pick with a method of “auto”:

mygraph +
  geom_smooth(method = "auto", se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Now let’s look just at Boston

boston <- filter(price_median, City == "Boston")

ggplot(boston, aes(PctNonLatinxWhite, MedianSFPrice)) +
  geom_point(size = 2) +
  theme_few() +
  geom_smooth(method = lm)

This looks like perhaps more of a relationship Which makes sense, since Boston is generally more expensive than outer suburbs, and many outer suburbs are heavily white. Considering just Boston takes out one important pricing factor: Distance from Boston.

You can see the statistics behind the correlation with cor.test()

cor.test(boston$PctNonLatinxWhite, boston$MedianSFPrice)
## 
##  Pearson's product-moment correlation
## 
## data:  boston$PctNonLatinxWhite and boston$MedianSFPrice
## t = 2.2073, df = 18, p-value = 0.04051
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02390675 0.75073338
## sample estimates:
##       cor 
## 0.4615447

A useful add-on R package for correlations is corr.

It’s still not incredibly strong. One reason could be that Zip Codes don’t necessarily correlate with neighborhoods. However, another reason may be that those two high-priced outliers are affecting the trend. What does it look like for areas where the median price is under a million dollars?

boston2 <- filter(boston, MedianSFPrice < 1000000)

ggplot(boston2, aes(PctNonLatinxWhite, MedianSFPrice)) +
  geom_point(size = 2) +
  theme_few() +
  geom_smooth(method = lm)

That’s pretty dramatic! By the way, you can get rid of the scientific notation with

options(scipen = 999)

I’ll also put dollar signs on the y axis

ggplot(boston2, aes(PctNonLatinxWhite, MedianSFPrice)) +
  geom_point(size = 2) +
  theme_few() +
  scale_y_continuous(labels = dollar) +
  geom_smooth(method = lm)

Now look at the correlation:

cor.test(boston2$PctNonLatinxWhite, boston2$MedianSFPrice)
## 
##  Pearson's product-moment correlation
## 
## data:  boston2$PctNonLatinxWhite and boston2$MedianSFPrice
## t = 5.7302, df = 16, p-value = 0.00003099
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5721415 0.9305963
## sample estimates:
##       cor 
## 0.8199816

There’s much more to regression than this, but we don’t have time to go into it. If you’re interested, you can see a nice tutorial by Kaggle data scientist Rachael Tatmen at https://www.kaggle.com/rtatman/regression-challenge-day-1.

Our last viz will be a bar plot. Let’s look at median prices for Boston, Cambridge, Somerville, Brookline, and Everett. We’ll read in a new data set:

cities <- readr::read_csv("data/zillow_median_sfhome_by_city.csv") %>%
  filter(City %in% c("Boston", "Cambridge", "Somerville", "Brookline", "Everett"))
## Parsed with column specification:
## cols(
##   City = col_character(),
##   Metro = col_character(),
##   County = col_character(),
##   SizeRank = col_double(),
##   Price = col_double()
## )

Here’s the base R version, although I tend not to use this for bar charts

barplot(names = cities$City, height = cities$Price)

ggplot makes some nice bar plots, although it can be difficult for me to remember my specific customizations. So, I’ve saved my favorite look as an Rstudio “code snippet”

If we have time, I’ll show you a ggplot code snippet.

First, go to Tools > Global Options > Code. At the bottom, make sure “Enable code snippets” is checked.

Next, run

usethis::edit_rstudio_snippets()

In the file that’s open, add the following somewhere. Make sure the 2nd and third lines start off with a tab indenting them.

snippet my_barplot
  ggplot(${1:mydataframe}, aes(x=${2:myxcolname}, y=${3:myycolname})) + 
  geom_bar(stat="identity", color = "black", fill="#0072B2")

Save and close the r.snippets file.

Now start typing my_barplot somewhere in your upper left RStudio pane and when you see the my_barplot dropdown, hit the tab key and see what happens. Fill in the name of mydataframe, then hit tab and fill in the name of the x column, and again fill in the y column.

ggplot(cities, aes(x=City, y=Price)) + 
  geom_col(color = "black", fill="#0072B2")

It can be helpful for the viewer to re-order the bars from large to small.

ggplot(cities, aes(x=reorder(City, -Price), y=Price)) + 
  geom_col(color = "black", fill="#0072B2")  

See how to do RStudio code snippets in one of my Do More With R video screencasts: https://youtu.be/h_i__VTSurU.

A couple of other general R resources that might be useful: